# Set the working directory (machine-specific path)
# NOTE(review): setwd() in a script is a global side effect; a
# project-relative path or here::here() would be more portable.
library(DBI)
library(RMySQL)
setwd("D:/R语言商务数据分析实战-源代码和实验数据/R语言商务数据分析实战/第6章/任务程序")

library(stringr)

# ------------- Build new features from third-party platform data -------------

# Code listing 6-5

df.master <- read.csv("./tmp/df_master.csv", stringsAsFactors = FALSE)
library(stringr)

# Third-party information processing
colnames(df.master)
third.Info1 <- df.master[, 91:209]  # extract third-party columns by position
third.Info_var <- apply(third.Info1, 2, var)  # column-wise variance
# sort the variances in decreasing order
third.Info_var1 <- third.Info_var[order((third.Info_var), decreasing = TRUE)]
barplot(third.Info_var1[1:20], las = 2, col = rainbow(20), 
        width = 3, legend.text = FALSE)  # bar chart of the 20 largest variances
# Build a 17 x 7 lookup matrix of third-party column names: entry [i, j]
# holds "ThirdParty_Info_Periodj_i" (feature i observed in period j).
third.info <- outer(
  1:17, 1:7,
  function(i, j) paste0("ThirdParty_Info_Period", j, "_", i)
)
# Simple per-borrower statistics over ThirdParty_Info_i_j: for each of the
# 17 feature groups, compute the max, min, median and sd across its 7 periods
# for every row, stored in new columns ThirdParty_Info_<i>_<stat>.
# The four near-identical loops of the original are folded into one driver
# loop over a named list of statistic functions; column creation order is
# preserved (all _max columns first, then _min, _median, _sd).
row.stats <- list(max = max, min = min, median = median, sd = sd)
for (stat.name in names(row.stats)) {
  stat.fun <- row.stats[[stat.name]]
  for (i in (1:17)) {
    colns <- paste("ThirdParty_Info_", i, "_", stat.name, sep = "")
    df.master[, colns] <- apply(df.master[, third.info[i, ]], 1, stat.fun)
  }
}

# Per-period statistics: for each of the 7 periods, compute the max, min,
# median and sd across that period's 17 features for every row, stored in
# columns ThirdParty_Info_period<j>_<stat>.  (The lower-case "period" in
# these names matches the original output schema and is kept as-is.)
period.stats <- list(max = max, min = min, median = median, sd = sd)
for (stat.name in names(period.stats)) {
  stat.fun <- period.stats[[stat.name]]
  for (j in (1:7)) {
    colns <- paste("ThirdParty_Info_period", j, "_", stat.name, sep = "")
    df.master[, colns] <- apply(df.master[, third.info[, j]], 1, stat.fun)
  }
}
write.csv(df.master, "./tmp/df_master_clear.csv", row.names = FALSE)  # export




# --------- Long/wide-table conversion for the login and update tables --------

# Code listing 6-6

# Read the raw event tables
df.tr.update <- read.csv("./data/Training_Userupdate.csv")
df.ts.update <- read.csv("./data/Test_Userupdate.csv")
df.tr.log <- read.csv("./data/Training_LogInfo.csv")
df.ts.log <- read.csv("./data/Test_LogInfo.csv")
# Stack training and test rows into single tables
df.update <- rbind(df.tr.update, df.ts.update)
df.log <- rbind(df.tr.log, df.ts.log)

# Compute the gaps between earliest/latest login dates and the loan listing
# date.
# Convert the relevant fields to Date.
# NOTE(review): columns are addressed by position — assumes col 2 is the
# listing date and col 5 the login date; verify against the raw CSV layout.
df.log1 <- data.frame(df.log[, 1], as.Date(df.log[, 2]), as.Date(df.log[, 5]))
df.log1 <- unique(df.log1)  # drop duplicate rows
# Day gap between the loan listing date and each login date
df.log2 <- data.frame(df.log1[, 1], df.log1[, 2] - df.log1[, 3])
colnames(df.log2) <- c("Idx", "date.diff")  # rename for later joins
df.fir.log <- aggregate(df.log2[, 2], list(df.log2[, 1]), max)  # first login (largest gap)
colnames(df.fir.log) <- c("Idx", "first.log")  # rename for later joins
df.fir.log[, "first.log"] <- sapply(df.fir.log[, "first.log"], as.numeric)
df.last.log <- aggregate(df.log2[, 2], list(df.log2[, 1]), min)  # last login (smallest gap)
colnames(df.last.log) <- c("Idx", "last.log")  # rename for later joins
df.last.log[, "last.log"] <- sapply(df.last.log[, "last.log"], as.numeric)
# Concatenate operation type and operation code with "," into a new log.full column
df.log[, "log.full"] <- as.data.frame(paste(df.log[, 3], df.log[, 4], sep = ","))

# Number of distinct login types used by each borrower
df.log.cats <- as.data.frame(table(unique(df.log[, c(1, 6)])["Idx"]))
colnames(df.log.cats) <- c("Idx", "log.cats")  # rename for later joins

# Count of each login type (operation type + code combo) per Idx
df.log.type <- as.data.frame.matrix(xtabs( ~ Idx + log.full, df.log, sparse = TRUE))
df.log.type[, "Idx"] <- rownames(df.log.type)  # add Idx column for later joins

# Login frequency: total log rows per user divided by (first.log + 1), i.e.
# logins per day since the user's first login before the listing date.
# NOTE(review): the division assumes df.log.1tab and df.fir.log are sorted
# identically by Idx — confirm both orderings agree.
df.log.1tab <- as.data.frame(table(df.log[, 1]))
df.log.fre <- data.frame(df.log.1tab[, 1], df.log.1tab[, 2] / 
                           as.data.frame(as.numeric(df.fir.log[, 2]) + 1))
colnames(df.log.fre) <- c("Idx", "log.frequency")

# Number of distinct days on which each user logged in
df.log.num <- table(unique(df.log[c("Idx", "LogInfo3")])$Idx)
df.log.num <- data.frame(df.log.num)
colnames(df.log.num) <- c("Idx", "lognum")

# Join all per-user login features on Idx
df.log.final <- merge(df.fir.log, df.last.log, by = "Idx")
df.log.final <- merge(df.log.final, df.log.cats, by = "Idx")
df.log.final <- merge(df.log.final, df.log.fre, by = "Idx")
df.log.final <- merge(df.log.final, df.log.type, by = "Idx")
df.log.final <- merge(df.log.final, df.log.num, by = "Idx")
write.csv(df.log.final, "./tmp/df_log_final.csv", row.names = FALSE)  # export

# Process UserUpdate
# String normalization
# Remove '_' from the updated-feature names so variant spellings collapse
df.update[, 3] <- str_replace_all(df.update[, 3], "[_]", "")
df.update[, 3] <- str_to_lower(df.update[, 3])  # lower-case all letters

# Count of each updated feature per Idx
df.update1 <- as.data.frame.matrix(xtabs( ~ Idx + UserupdateInfo1, 
                                          df.update, sparse = TRUE))
# Average modifications per tracked feature for each user (row total divided
# by the number of feature columns; length() of a data frame = column count)
df.update1[, "updatecompl"] <- apply(df.update1, 1, sum) / length(df.update1)
Idx <- rownames(df.update1)
df.update1 <- data.frame(Idx, df.update1)  # add Idx column for later joins

df.update2 <- data.frame(df.update[, 1], as.Date(df.update[, 2]), 
                         as.Date(df.update[, 4]))  # convert fields to Date
df.update2 <- unique(df.update2)
# Day gap between the listing date and each update date
df.update3 <- data.frame(df.update2[, 1], df.update2[, 2] - df.update2[, 3])
colnames(df.update3) <- c("Idx", "date.diff")  # rename fields
# Earliest update (largest gap before the listing date)
df.fir.update <- aggregate(df.update3[, 2], list(df.update3[, 1]), max)
colnames(df.fir.update) <- c("Idx", "first.update")  # rename fields
df.fir.update[, "first.update"] <- sapply(df.fir.update[, "first.update"], 
                                          as.numeric)
# Latest update (smallest gap)
df.last.update <- aggregate(df.update3[, 2], list(df.update3[, 1]), min)
colnames(df.last.update) <- c("Idx", "last.update")  # rename fields
df.last.update[, "last.update"] <- sapply(df.last.update[, "last.update"], 
                                          as.numeric)
# Number of distinct features each user modified
df.update.cats <- as.data.frame(table(unique(df.update[, c(1, 3)])["Idx"]))
colnames(df.update.cats) <- c("Idx", "update.cats")  # rename fields

# Frequency of information updates per day since the user's first update.
# FIX: counts must come from the raw per-event table df.update (one row per
# update record), not from df.update1, which has already been collapsed to
# one row per Idx — there, table() would always yield 1 per user.  This now
# mirrors the login-frequency logic above.
df.update.1tab <- as.data.frame(table(df.update[, 1]))
df.update.fre <- data.frame(df.update.1tab[, 1], df.update.1tab[, 2] / 
                              as.data.frame(as.numeric(df.fir.update[, 2]) + 1))
colnames(df.update.fre) <- c("Idx", "update.frequency")

# Number of distinct days on which each user updated information
df.update.num <- table(unique(df.update[c("Idx", "UserupdateInfo2")])$Idx)
df.update.num <- data.frame(df.update.num)
colnames(df.update.num) <- c("Idx", "update.num")

# Join all per-user update features on Idx
df.update.final <- merge(df.fir.update, df.last.update, by = "Idx")
df.update.final <- merge(df.update.final, df.update.cats, by = "Idx")
df.update.final <- merge(df.update.final, df.update.fre, by = "Idx")
df.update.final <- merge(df.update.final, df.update1, by = "Idx")
df.update.final <- merge(df.update.final, df.update.num, by = "Idx")
write.csv(df.update.final, "./tmp/df_update_final.csv", row.names = FALSE)




# ---------------------- Transform and clean the P2P credit data --------------

# Code listing 6-7

# Part 1: categorical feature handling
df.master <- read.csv("./tmp/df_master_clear.csv", stringsAsFactors = TRUE)

# Strip space characters from UserInfo_9
df.master[, "UserInfo_9"] <- str_replace_all(df.master[, "UserInfo_9"], " ", "")
df.master[, "UserInfo_9"] <- as.factor(df.master[, "UserInfo_9"]) # back to factor
# Province match indicator: 1 if the two province fields agree, 0 otherwise
df.master[, "diffprov"] <- as.integer(df.master["UserInfo_7"] == 
                                        df.master["UserInfo_19"])
# NA comparisons (missing province) are coded as -1
df.master[is.na(df.master["diffprov"]), "diffprov"] <- as.integer(-1)
# String cleanup for the four city fields
city <- c("UserInfo_2", "UserInfo_4", "UserInfo_8", "UserInfo_20")
for (i in (1:length(city))) {
  # drop the trailing "市" ("city") suffix
  df.master[, city[i]] <- str_replace_all(df.master[, city[i]], "市", "")
}

# Pairwise equality indicators between the four city fields
diff.cols <- c()
for (i in (1:(length(city) - 1))) {
  # inner index runs downward; this only affects column creation order
  for (j in (length(city):(i + 1))) {
    tmp <- paste("UserInfodiff_", strsplit(city[i], "_")[[1]][2], "_", 
                 strsplit(city[j], "_")[[1]][2], sep = "")
    # UserInfodiff_1_2 flags whether city columns 1 and 2 match
    diff.cols <- c(diff.cols, tmp)  
    df.master[, tmp] = as.integer(df.master[, city[i]] == df.master[, city[j]])
  }
}
# NA (missing city) comparisons are coded as -1
df.master[, diff.cols][is.na(df.master[, diff.cols])] <- as.integer(-1)
prov <- c("UserInfo_7", "UserInfo_19")
# Replace literal province / city names with anonymized codes such as
# "prov_3" or "city_12"; integer codes come from the factor level order.
recode.region <- function(values, prefix) {
  codes <- as.integer(as.factor(values))
  as.factor(paste(prefix, codes, sep = "_"))
}
for (col in prov) {
  df.master[, col] <- recode.region(df.master[, col], "prov")
}
for (col in city) {
  df.master[, col] <- recode.region(df.master[, col], "city")
}
# Data transformation
# Factor-type features
# Dummy-encoding preparation: collapse rare levels and fill missing values.
#
# Args:
#   x: vector holding one categorical feature (coerced to character).
#   a: rarity threshold as a fraction of the sample (default 0.002).
#
# Returns a factor where:
#   * levels whose combined share is below `a` are merged into "other",
#   * missing entries become "unknow" when NAs are at least a fraction `a`
#     of the sample, otherwise they are imputed with the modal level.
DummyFact <- function(x, a = 0.002) {
  x <- as.character(x)
  level.share <- table(x) / length(x)  # share of each level (NAs excluded)
  rare <- level.share[level.share < a]
  # Merge rare levels into a single "other" bucket when they jointly exceed
  # the threshold (%in% replaces the original per-level OR loop).
  if (sum(rare) > a) {
    x[x %in% names(rare)] <- "other"
  }
  # Only NA marks missingness here; the original also tested is.infinite(),
  # which is always FALSE on a character vector.
  missing <- is.na(x)
  if (1 - sum(level.share) >= a) {
    # Enough missing values to be informative: keep them as their own level
    # ("unknow" is the original label, kept for output compatibility).
    x[missing] <- "unknow"
  }
  else {
    # Too few to matter: impute the modal level.  which.max picks the FIRST
    # mode; the original could return several tied modes and silently
    # recycle them into the assignment.
    x[missing] <- names(level.share)[which.max(level.share)]
  }
  as.factor(x)
}

# Dummy-variable encoding
library(caret)
# Identify factor columns (tested on the first row's column classes)
isf <- as.logical(lapply(df.master[1, ], is.factor))
df.master.f <- df.master[, isf]
df.master.nf <- df.master[, !isf]
# NOTE(review): apply() coerces the data frame to a character matrix before
# DummyFact sees each column, so the result here is a character matrix (not
# factor columns); dummyVars below still treats the columns as categorical.
df.master.f <- apply(df.master.f, 2, DummyFact)
mainEffects <- dummyVars(~., data = df.master.f, sep = "_")
df.master.f <- predict(mainEffects, df.master.f)
df.master <- cbind(df.master.nf, df.master.f)





# Code listing 6-8

# Part 2: missing-value handling for numeric data
df.update.final <- read.csv("./tmp/df_update_final.csv", 
                            stringsAsFactors = FALSE)
df.log.final <- read.csv("./tmp/df_log_final.csv", stringsAsFactors = FALSE)
# Count the missing (NA) entries in a vector.
#
# Args:
#   x: any vector.
# Returns: integer count of NA elements.
MissSum <- function(x) {
  sum(is.na(x))
}
miss.sum.num <- sapply(df.master, MissSum)  # NA count per column
miss.rate <- miss.sum.num / length(df.master[, 1])  # divide by the row count
# Bar chart of the five highest per-feature missing ratios
barplot(miss.rate[order(miss.rate, decreasing = TRUE)[1:5]], ylim = c(0, 1),
        main = "特征缺失比率图", xlab = "特征名称", ylab = "缺失比率")
df.master <- df.master[miss.rate < 0.965]  # drop columns missing >= 96.5%

# Before multicollinearity handling, fill numeric NAs with the column median
library(Hmisc)
num.miss.col <- sapply(df.master, anyNA)
num.miss.col <- as.data.frame(num.miss.col[num.miss.col == TRUE])
num.miss.col <- setdiff(row.names(num.miss.col), "target")
df.master[num.miss.col] <- impute(df.master[num.miss.col], median)  # median imputation





# Code listing 6-9

# Part 3: drop redundant features
inval.cols <- c("Idx", "target")
line.cols <- names(df.master)
line.cols <- setdiff(line.cols, inval.cols)
df.master[, line.cols][is.na(df.master[, line.cols])] <- as.integer(-1)
df.cor <- cor(df.master[,line.cols])
df.len <- length(df.cor[1, ])
cor.col <- c(1:df.len)
# Drop the later column of any pair with |correlation| > 0.99.
# NOTE(review): a column already removed from cor.col can still act as the
# "kept" member of later pairs, so the result is order-dependent — this
# matches the original behavior and is left unchanged.
for (i in (1:(df.len - 1))) {
  for (j in c((i + 1):df.len)) {
    if (!is.na(df.cor[i, j])) {
      if (abs(df.cor[i, j]) > 0.99) {
        tmp <- which(cor.col == j)
        if (length(tmp) > 0) {
          cor.col <- cor.col[-tmp]
        }
      }
    }
  }
}
df.master <- df.master[c("Idx", line.cols[cor.col], "target")]
# Join the three feature tables on Idx; outer joins keep every borrower
# that appears in any of the tables.
df.final <- merge(df.update.final, df.log.final, by = "Idx", all = TRUE)
df.final <- merge(df.final, df.master, by = "Idx", all = TRUE)

# Borrowers with no login / update records carry NA from the outer join;
# fill those feature columns with integer zero.
inval.cols <- c("Idx", "target")
log.up.col <- c(colnames(df.update.final)[-1], colnames(df.log.final)[-1])
na.mask <- is.na(df.final[, log.up.col])
for (k in seq_len(ncol(na.mask))) {
  df.final[na.mask[, k], log.up.col[k]] <- as.integer(0)
}

# Near-constant column filter.
#
# Args:
#   x: vector (typically one column of a data frame).
# Returns: FALSE when one value accounts for more than 99% of the entries
#   (the column should be dropped), TRUE otherwise.
SmVal <- function(x) {
  valtlb <- table(x) / length(x)  # share of each distinct value
  # any() replaces the original sum(valtlb[valtlb > 0.99]) != 0 test; shares
  # are strictly positive, so the two are equivalent.  TRUE/FALSE replace
  # the reassignable shorthands T/F.
  !any(valtlb > 0.99)
}
# Keep only the columns that pass the near-constant filter.
# (apply() coerces df.final to a matrix first, exactly as before.)
keep.cols <- apply(df.final, 2, SmVal)
df.final <- df.final[, keep.cols]

# Coefficient of variation: standard deviation divided by the mean, with
# missing values removed from both statistics.
#
# Args:
#   x: numeric vector.
# Returns: sd(x) / mean(x) as a single numeric value.
Csd <- function(x) {
  sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
}
# Drop numeric feature columns whose coefficient of variation is below 0.1.
# FIX: the original indexed ALL of df.final with a logical vector computed
# only over the numeric feature columns, so the mask was recycled and
# selected the wrong columns (and NaN CVs produced NA subscripts).  Columns
# are now dropped by name, leaving Idx/target and non-numeric columns alone.
num.cols <- sapply(df.final, is.numeric)
num.cols[inval.cols] <- FALSE  # never judge the id / target columns
cv.vals <- vapply(df.final[num.cols], Csd, numeric(1))
# Columns with undefined CV (e.g. zero mean -> NaN) are kept, not dropped.
low.cv <- names(cv.vals)[!is.na(cv.vals) & cv.vals < 0.1]
df.final <- df.final[, setdiff(names(df.final), low.cv), drop = FALSE]

write.csv(df.final, "./tmp/df_final.csv", row.names = FALSE)
